IMPORT

Librairies

library(phyloseq) # for phyloseq object
library(Biostrings)
library(ggplot2)
library(cowplot)
library(tidyverse)
library("plotly") # plot 3D
library("microbiome") # for centered log-ratio
library("coda") # Aitchison distance
library("coda.base") # Aitchison distance
library("vegan") # NMDS
library(pheatmap) # for heatmap

Data

prune_samples(sample_sums(physeq.agp)>=500, physeq.agp)
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 6391 taxa and 1190 samples ]
sample_data() Sample Data:       [ 1190 samples by 25 sample variables ]
tax_table()   Taxonomy Table:    [ 6391 taxa by 7 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 6391 tips and 6389 internal nodes ]
refseq()      DNAStringSet:      [ 6391 reference sequences ]

Phylogenetic tree was computed with the package phangorn, and the script was run on a cluster. Let’s check we have correctly generated a phylogenetic tree.

# Look at the tree
plot_tree(physeq.agp, color = "host_disease", ladderize="left")

ABUNDANCES

1. Absolute abundances

# Plot Phylum
plot_bar(physeq.agp, fill = "Phylum") + facet_wrap("host_disease", scales="free") +
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Absolute abundance", title = "AGP dataset")

Sequencing depth characteristics of the Zhuang dataset:
- minimum of 578 total count per sample
- median: 2.328410^{4} total count per sample
- maximum of 4.5417410^{5} total count per sample

2. Relative abundances

3. Remove bloom sequences

Looking at it more closely, there is a high proportion of Proteobacteria compared to other datasets. Considering that fecal samples were shipped from the participants’ home at room temperature, there is evidence that some bacteria grow at room temperature, which would bias the microbial composition (https://journals.asm.org/doi/10.1128/mSystems.00199-16#B5). Authors from the AGP removed a selection of bloom sequences that were shared on their github repository in a FASTA file.

3. Firmicutes/Bacteroidota ratio

NORMALIZE DATA

# Sanity check no sample with less than 500 total count
table(sample_sums(physeq.filt)<500) # all FALSE

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH NON-ZERO COMPOSITIONS
physeq.NZcomp <- physeq.filt
otu_table(physeq.NZcomp)[otu_table(physeq.NZcomp) == 0] <- 0.5 # pseudocounts

# Sanity check that 0 values have been replaced
# otu_table(physeq.filt)[1:5,1:5]
# otu_table(physeq.NZcomp)[1:5,1:5]

# transform into compositions
physeq.NZcomp <- transform_sample_counts(physeq.NZcomp, function(x) x / sum(x) )
table(rowSums(otu_table(physeq.NZcomp))) # check if there is any row not summing to 1

# Save object
saveRDS(physeq.NZcomp, file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_NZcomp.rds"))

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH RELATIVE COUNT (BETWEEN 0 AND 1)
physeq.rel <- physeq.filt
physeq.rel <- transform_sample_counts(physeq.rel, function(x) x / sum(x) ) # divide each count by the total number of counts (per sample)

# check the counts are all relative
# otu_table(physeq.filt)[1:5, 1:5]
# otu_table(physeq.rel)[1:5, 1:5]

# sanity check
table(rowSums(otu_table(physeq.rel))) # check if there is any row not summing to 1

# save the physeq.rel object
saveRDS(physeq.rel, file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_relative.rds"))

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH COMMON-SCALE NORMALIZATION
physeq.CSN <- physeq.filt
physeq.CSN <- transform_sample_counts(physeq.CSN, function(x) (x*min(sample_sums(physeq.CSN))) / sum(x) )

# sanity check
table(rowSums(otu_table(physeq.CSN))) # check that all rows are summing to the same total

# save the physeq.CSN object
saveRDS(physeq.CSN, file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_CSN.rds"))


#____________________________________________________________________
# PHYLOSEQ OBJECT WITH CENTERED LOG RATIO COUNT
physeq.clr <- physeq.filt
physeq.clr <- microbiome::transform(physeq.filt, "clr") # the function adds pseudocounts itself

# Compare the otu tables in the original phyloseq object and the new one after CLR transformation
otu_table(physeq.filt)[1:5, 1:5] # should contain absolute counts
otu_table(physeq.clr)[1:5, 1:5] # should all be relative

# save the physeq.rel object
saveRDS(physeq.clr, file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_clr.rds"))

COMPUTE DISTANCES

1. UniFrac, Aitchison, Bray-Curtis and Canberra

First, let’s look at these four distances of interest.

Now let’s plot!

# Get the distances & the plot data
dist.agp <- getDistances()
plot.df <- plotDistances2D(dist.agp)
# Plot
ggplot(plot.df, aes(Axis.1, Axis.2, color=host_disease))+
  geom_point(size=2, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_text(size=20))+
  labs(color="Disease")

# ggsave(file.path(path.plots, "distances4_MDS.jpg"), height = 4, width = 15)

2. Plot in 3D

For better visualization, we will also take a glance at reduction to 3D.

Now let’s plot!

plotDistances3D(dist.agp$UniF, "UniFrac")
plotDistances3D(dist.agp$Ait, "Aitchison")
plotDistances3D(dist.agp$Canb, "Canberra")
plotDistances3D(dist.agp$Bray, "Bray-Curtis")

HIERARCHICAL CLUSTERING

# For heatmaps: have group color
matcol <- data.frame(group = sample_data(physeq.filt)[,"host_disease"])


# Function to get heatmap from the distances computed
plotHeatmaps <- function(dlist, fontsize){
  
  # Initialize variables
  i=1
  plist <- vector("list", 4)
  names(plist) <- names(dlist)
  
  # Loop through distances
  for(d in dlist){
    plist[[i]] <- pheatmap(as.matrix(d), 
                          clustering_distance_rows = d,
                          clustering_distance_cols = d,
                          fontsize = fontsize,
                          fontsize_col = fontsize-5,
                          fontsize_row = fontsize-5,
                          annotation_col = matcol,
                          annotation_row = matcol,
                          annotation_colors = list(host_disease = c('Healthy' = 'blue', 'IBS' = 'red')),
                          cluster_rows = T,
                          cluster_cols = T,
                          clustering_method = 'complete', # hc method
                          main = names(dlist)[i]) # have name of distance as title
    i <- i+1
  }
  
  return(plist)
}


# Get the heatmaps
heatmp.agp <- plotHeatmaps(dlist = dist.agp, fontsize = 8)

---
title: "AGP_DataAnalysis"
output:
  html_document:
    df_print: paged
    toc: yes
  pdf_document: default
  html_notebook:
    toc: yes
---

```{r, include=FALSE}
knitr::opts_chunk$set(fig.width = 15, fig.height = 5, warning=FALSE, message=FALSE,
                      root.dir = "~/Projects/IBS_Meta-analysis_16S/analysis-individual/AGP/")
```



############
## IMPORT ##
############

## Librairies

```{r library-import}
library(phyloseq) # for phyloseq object
library(Biostrings)
library(ggplot2)
library(cowplot)
library(tidyverse)
library("plotly") # plot 3D
library("microbiome") # for centered log-ratio
library("coda") # Aitchison distance
library("coda.base") # Aitchison distance
library("vegan") # NMDS
library(pheatmap) # for heatmap
```


## Data

```{r import-data}
# Set path
path <- "~/Projects/IBS_Meta-analysis_16S"

# Import phyloseq object
# physeq.agp <- readRDS(file.path(path, "phyloseq-objects/physeq_AGPfilt.rds"))
physeq.agp <- readRDS(file.path(path, "data/OutputPhangorn/physeq_AGPfilt.rds"))
physeq.agp <- prune_samples(sample_sums(physeq.agp)>=500, physeq.agp)

# Sanity check
physeq.agp
```

Phylogenetic tree was computed with the package `phangorn`, and the script was run on a cluster. Let's check we have correctly generated a phylogenetic tree.

```{r plot-tree, fig.height = 6, fig.width = 10}
# Look at the tree
plot_tree(physeq.agp, color = "host_disease", ladderize="left")
```


```{r import-for-pdf, echo = FALSE}
#__________________________________________________________________________
#____________ THIS IS ONLY FOR (quicker) PDF/HTML OUTPUT __________________
#__________________________________________________________________________

# Import phyloseq objects for computing distances
physeq.rel <- readRDS(file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_relative.rds"))
physeq.clr <- readRDS(file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_clr.rds"))
physeq.CSN <- readRDS(file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_CSN.rds"))
physeq.NZcomp <- readRDS(file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_NZcomp.rds"))

dist.agp <- readRDS(file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/distances.rds"))
plot.df <- readRDS(file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/plot_distances.rds"))
```


################
## ABUNDANCES ##
################

### 1. Absolute abundances

```{r plot-absolute-abundance, fig.width=10, fig.height = 5}
# Plot Phylum
plot_bar(physeq.agp, fill = "Phylum") + facet_wrap("host_disease", scales="free") +
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Absolute abundance", title = "AGP dataset")
```

**Sequencing depth** characteristics of the Zhuang dataset:  
- minimum of `r min(sample_sums(physeq.agp))` total count per sample  
- median: `r median(sample_sums(physeq.agp))` total count per sample  
- maximum of `r max(sample_sums(physeq.agp))` total count per sample  


### 2. Relative abundances

```{r plot-relative-abundances}
# Agglomerate to phylum & class levels
phylum.table <- physeq.agp %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

class.table <- physeq.agp %>%
  tax_glom(taxrank = "Class") %>%
  transform_sample_counts(function(x) {x/sum(x)} ) %>%
  psmelt()


# Plot relative abundances
ggplot(phylum.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Bacteroidota', 'Abundance'])),
                         y = Abundance, fill = Phylum))+
  facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Relative abundance", title = "AGP dataset")

ggplot(class.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Bacteroidota', 'Abundance'])),
                        y = Abundance, fill = Class))+
  facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Relative abundance", title = "AGP dataset")
```


### 3. Remove bloom sequences

Looking at it more closely, there is a high proportion of Proteobacteria compared to other datasets. Considering that fecal samples were shipped from the participants' home at room temperature, there is evidence that some bacteria grow at room temperature, which would bias the microbial composition (https://journals.asm.org/doi/10.1128/mSystems.00199-16#B5). Authors from the AGP removed a selection of **bloom sequences** that were shared on their [github repository][1] in a FASTA file.

[1]: https://github.com/knightlab-analyses/bloom-analyses

```{r bloom-sequences}
# Import bloom sequences
bloom <- readDNAStringSet(file.path(path, "data/analysis-individual/AGP/newbloom.all.fna"))

# Identify ASVs to keep (ones that are not bloom sequences)
# note: trim ASVs to 150bp as bloom sequences are 150bp (a few ASVs are 151bp)
ASVs_to_keep <- names(setdiff(subseq(refseq(physeq.agp), start=1, width=150), bloom))

# Check taxonomic classification of these bloom sequences
tax_table(physeq.agp) %>%
  as.data.frame %>%
  rownames_to_column(var="ASV") %>%
  filter(!ASV %in% ASVs_to_keep) %>%
  group_by(Class) %>%
  summarize(n=n())

# Remove unwanted taxa (bloom)
physeq.filt <- prune_taxa(ASVs_to_keep, physeq.agp)
physeq.filt <- prune_samples(sample_sums(physeq.filt)>=500, physeq.filt)

phylum.table.filt <- physeq.filt %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format


# PLOT
a <- ggplot(phylum.table, aes(x = host_disease,
                              y = Abundance, fill = Phylum))+
  geom_bar(stat = "identity") +
  theme(legend.position="none")+
  labs(x = "", y = "Relative abundance", title = "Original data")

b <- ggplot(phylum.table.filt, aes(x = host_disease,
                                   y = Abundance, fill = Phylum))+
  geom_bar(stat = "identity") +
  # theme(axis.text.x = element_blank())+
  labs(x = "", y = "Relative abundance", title = "Bloom sequences removed")

gridExtra::grid.arrange(a, b, ncol=2, widths=c(0.4,0.6))
```


### 3. Firmicutes/Bacteroidota ratio

```{r plot-log-ratio, fig.height = 5, fig.width=3}
# Extract abundance of only Bacteroidota and Firmicutes
bacter <- phylum.table.filt %>%
  filter(Phylum == "Bacteroidota") %>%
  select(c('Sample', 'Abundance', 'host_disease', 'Phylum')) %>%
  arrange(Sample)

firmi <- phylum.table.filt %>%
  filter(Phylum == "Firmicutes") %>%
  select(c('Sample', 'Abundance', 'host_disease', 'Phylum')) %>%
  arrange(Sample)

# Calculate log2 ratio Firmicutes/Bacteroidota
ratio.FB <- data.frame('Sample' = bacter$Sample,
                       'host_disease' = bacter$host_disease,
                       'Bacteroidota' = bacter$Abundance,
                       'Firmicutes' = firmi$Abundance)
ratio.FB$logRatioFB <- log2(ratio.FB$Firmicutes / ratio.FB$Bacteroidota)

# Plot log2 ratio Firmicutes/Bacteroidota
ggplot(ratio.FB, aes(x = host_disease, y = logRatioFB))+
  geom_boxplot(outlier.shape = NA)+
  geom_jitter(width=0.1)+
  labs(x = "",  y = 'Log2(Firmicutes/Bacteroidota)', title = "Firmicutes:Bacteroidota ratio")+
  theme_cowplot()
```


```{r export-plots-1, eval = FALSE, echo = FALSE}
# Save path to plots
path.plots <- file.path(path, "data/analysis-individual/PLOTS/plots-AGP-EDA")

# Absolute abundance phylum
jpeg(file.path(path.plots, "absAbundance_phylum.jpg"), width = 4000, height = 2000)
plot_bar(physeq.filt, fill = "Phylum") + facet_wrap("host_disease", scales="free") +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_text(size = 30),
        legend.text = element_text(size = 50),
        axis.title = element_text(size = 50),
        title = element_text(size = 70),
        strip.text = element_text(size = 50))+
  labs(x = "Samples", y = "Total read count", title = "AGP dataset")
dev.off()

# Relative abundances
jpeg(file.path(path.plots, "relAbundance_phylum.jpg"), width = 4000, height = 2000)
ggplot(phylum.table.filt, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Bacteroidota', 'Abundance'])),
                         y = Abundance, fill = Phylum))+
  facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_text(size = 30),
        legend.text = element_text(size = 50),
        axis.title = element_text(size = 50),
        title = element_text(size = 70),
        strip.text = element_text(size = 50))+
  labs(x = "Samples", y = "Relative abundance", title = "AGP dataset")
dev.off()

jpeg(file.path(path.plots, "relAbundance_class.jpg"), width = 4000, height = 2000)
ggplot(class.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Bacteroidota', 'Abundance'])),
                        y = Abundance, fill = Class))+
  facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_text(size = 30),
        legend.text = element_text(size = 50),
        axis.title = element_text(size = 50),
        title = element_text(size = 70),
        strip.text = element_text(size = 50))+
  labs(x = "Samples", y = "Relative abundance", title = "AGP dataset")
dev.off()

# Firmicutes:Bacteroidota log ratio
jpeg(file.path(path.plots, "ratioFB.jpg"), width = 1500, height = 2000)
ggplot(ratio.FB, aes(x = host_disease, y = logRatioFB))+
  geom_boxplot(outlier.shape = NA, lwd = 3)+
  geom_jitter(size = 8, width=0.1)+
  labs(x = "",  y = 'Log2(Firmicutes/Bacteroidota)', title = "AGP dataset")+
  theme_classic()+
  theme(axis.text = element_text(size = 50),
        legend.text = element_text(size = 50),
        axis.title = element_text(size = 50),
        title = element_text(size = 50))
dev.off()
```



####################
## NORMALIZE DATA ##
####################

```{r normalize-data, eval = FALSE, echo = TRUE}
# Sanity check no sample with less than 500 total count
table(sample_sums(physeq.filt)<500) # all FALSE

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH NON-ZERO COMPOSITIONS
physeq.NZcomp <- physeq.filt
otu_table(physeq.NZcomp)[otu_table(physeq.NZcomp) == 0] <- 0.5 # pseudocounts

# Sanity check that 0 values have been replaced
# otu_table(physeq.filt)[1:5,1:5]
# otu_table(physeq.NZcomp)[1:5,1:5]

# transform into compositions
physeq.NZcomp <- transform_sample_counts(physeq.NZcomp, function(x) x / sum(x) )
table(rowSums(otu_table(physeq.NZcomp))) # check if there is any row not summing to 1

# Save object
saveRDS(physeq.NZcomp, file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_NZcomp.rds"))

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH RELATIVE COUNT (BETWEEN 0 AND 1)
physeq.rel <- physeq.filt
physeq.rel <- transform_sample_counts(physeq.rel, function(x) x / sum(x) ) # divide each count by the total number of counts (per sample)

# check the counts are all relative
# otu_table(physeq.filt)[1:5, 1:5]
# otu_table(physeq.rel)[1:5, 1:5]

# sanity check
table(rowSums(otu_table(physeq.rel))) # check if there is any row not summing to 1

# save the physeq.rel object
saveRDS(physeq.rel, file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_relative.rds"))

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH COMMON-SCALE NORMALIZATION
physeq.CSN <- physeq.filt
physeq.CSN <- transform_sample_counts(physeq.CSN, function(x) (x*min(sample_sums(physeq.CSN))) / sum(x) )

# sanity check
table(rowSums(otu_table(physeq.CSN))) # check that all rows are summing to the same total

# save the physeq.CSN object
saveRDS(physeq.CSN, file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_CSN.rds"))


#____________________________________________________________________
# PHYLOSEQ OBJECT WITH CENTERED LOG RATIO COUNT
physeq.clr <- physeq.filt
physeq.clr <- microbiome::transform(physeq.filt, "clr") # the function adds pseudocounts itself

# Compare the otu tables in the original phyloseq object and the new one after CLR transformation
otu_table(physeq.filt)[1:5, 1:5] # should contain absolute counts
otu_table(physeq.clr)[1:5, 1:5] # should all be relative

# save the physeq.rel object
saveRDS(physeq.clr, file.path(path, "data/analysis-individual/AGP/02_EDA-AGP/physeq_clr.rds"))
```


#######################
## COMPUTE DISTANCES ##
#######################

### 1. UniFrac, Aitchison, Bray-Curtis and Canberra

First, let's look at these four distances of interest.

```{r dist-functions, fig.width = 15, fig.height = 4}
#____________________________________________________________________________________
# Measure distances
getDistances <- function(){
  set.seed(123) # for unifrac, need to set a seed
  glom.UniF <- UniFrac(physeq.rel, weighted=TRUE, normalized=TRUE) # weighted unifrac
  glom.ait <- phyloseq::distance(physeq.clr, method = 'euclidean') # aitchison
  glom.bray <- phyloseq::distance(physeq.CSN, method = "bray") # bray-curtis
  glom.can <- phyloseq::distance(physeq.NZcomp, method = "canberra") # canberra
  dist.list <- list("UniF" = glom.UniF, "Ait" = glom.ait, "Canb" = glom.can, "Bray" = glom.bray)
  
  return(dist.list)
}


#____________________________________________________________________________________
# Plot in 2D the distances
plotDistances2D <- function(dlist, ordination="MDS"){
  plist <- NULL
  plist <- vector("list", 4)
  names(plist) <- c("Weighted Unifrac", "Aitchison", "Bray-Curtis", "Canberra")
  
  print("Unifrac")
  # Weighted UniFrac
  set.seed(123)
  iMDS.UniF <- ordinate(physeq.rel, ordination, distance=dlist$UniF)
  plist[[1]] <- plot_ordination(physeq.rel, iMDS.UniF, color="host_disease")
  
  print("Aitchison")
  # Aitchison
  set.seed(123)
  iMDS.Ait <- ordinate(physeq.clr, ordination, distance=dlist$Ait)
  plist[[2]] <- plot_ordination(physeq.clr, iMDS.Ait, color="host_disease")
  
  print("Bray")
  # Bray-Curtis
  set.seed(123)
  iMDS.Bray <- ordinate(physeq.CSN, ordination, distance=dlist$Bray)
  plist[[3]] <- plot_ordination(physeq.CSN, iMDS.Bray, color="host_disease")
  
  print("Canberra")
  # Canberra
  set.seed(123)
  iMDS.Can <- ordinate(physeq.NZcomp, ordination, distance=dlist$Can)
  plist[[4]] <- plot_ordination(physeq.NZcomp, iMDS.Can, color="host_disease")
  
  # Creating a dataframe to plot everything
  plot.df = ldply(plist, function(x) x$data)
  names(plot.df)[1] <- "distance"
  
  return(plot.df)
}
```


Now let's plot!

```{r dist-run-1, echo=TRUE, eval=FALSE}
# Get the distances & the plot data
dist.agp <- getDistances()
plot.df <- plotDistances2D(dist.agp)
```

```{r dist-run-2}
# Plot
ggplot(plot.df, aes(Axis.1, Axis.2, color=host_disease))+
  geom_point(size=2, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_text(size=20))+
  labs(color="Disease")

# ggsave(file.path(path.plots, "distances4_MDS.jpg"), height = 4, width = 15)
```


```{r tong, echo=FALSE, eval=FALSE, include=FALSE}
# Share with Tong
plot_ordination(physeq.CSN,
                ordinate(physeq.CSN, "PCoA", distance=dist.agp$Bray),
                color="host_disease") +
  theme_bw()
ggsave(file.path(path.plots, "PCoA_bray.jpg"), height = 5, width = 7)
```



### 2. Plot in 3D

For better visualization, we will also take a glance at reduction to 3D.

```{r 3D-function, eval = TRUE, echo = TRUE}
#____________________________________________________________________________________
# Plot 3D ordination
plotDistances3D <- function(d, name_dist){
  
  # Reset parameters
  mds.3D <- NULL
  xyz <- NULL
  fig.3D <- NULL
  
  # Reduce distance matrix to 3 dimensions
  set.seed(123)
  mds.3D <- metaMDS(d, method="MDS", k=3, trace = 0)
  xyz <- scores(mds.3D, display="sites") # pull out the (x,y,z) coordinates
  
  # Plot
  fig.3D <- plot_ly(x=xyz[,1], y=xyz[,2], z=xyz[,3], type="scatter3d", mode="markers",
                    color=sample_data(physeq.filt)$host_disease, colors = c("blue", "red"))%>%
    layout(title = paste('MDS in 3D with', name_dist, 'distance', sep = ' '))
  
  return(fig.3D)
}

```


Now let's plot!

```{r 3D-plot, fig.width = 8, fig.height = 5}
plotDistances3D(dist.agp$UniF, "UniFrac")
plotDistances3D(dist.agp$Ait, "Aitchison")
plotDistances3D(dist.agp$Canb, "Canberra")
plotDistances3D(dist.agp$Bray, "Bray-Curtis")
```




#############################
## HIERARCHICAL CLUSTERING ##
#############################

```{r hierarchical-clust, fig.width = 25, fig.width = 7}
# For heatmaps: have group color
matcol <- data.frame(group = sample_data(physeq.filt)[,"host_disease"])


# Function to get heatmap from the distances computed
plotHeatmaps <- function(dlist, fontsize){
  
  # Initialize variables
  i=1
  plist <- vector("list", 4)
  names(plist) <- names(dlist)
  
  # Loop through distances
  for(d in dlist){
    plist[[i]] <- pheatmap(as.matrix(d), 
                          clustering_distance_rows = d,
                          clustering_distance_cols = d,
                          fontsize = fontsize,
                          fontsize_col = fontsize-5,
                          fontsize_row = fontsize-5,
                          annotation_col = matcol,
                          annotation_row = matcol,
                          annotation_colors = list(host_disease = c('Healthy' = 'blue', 'IBS' = 'red')),
                          cluster_rows = T,
                          cluster_cols = T,
                          clustering_method = 'complete', # hc method
                          main = names(dlist)[i]) # have name of distance as title
    i <- i+1
  }
  
  return(plist)
}


# Get the heatmaps
heatmp.agp <- plotHeatmaps(dlist = dist.agp, fontsize = 8)
```


